# Environment and library setup for the evaluation notebook.
import os
import sys
import warnings
import logging
from pathlib import Path
from dotenv import load_dotenv
import polars as pl
# NOTE: `eval` here is inspect_ai's evaluation runner and shadows the builtin.
from inspect_ai import Task, task, eval
from inspect_ai.dataset import Sample
from inspect_ai.scorer import Score, scorer, Scorer, CORRECT, INCORRECT, PARTIAL
from inspect_ai.solver import generate, Solver, solver
from inspect_ai.model import get_model
# Suppress all warnings and async-related errors
warnings.filterwarnings('ignore')
logging.getLogger('asyncio').setLevel(logging.CRITICAL)
# Fix for nested asyncio in Jupyter/Quarto
import nest_asyncio
nest_asyncio.apply()
# Load environment variables from a local .env file (expects ANTHROPIC_API_KEY)
load_dotenv()
assert os.getenv("ANTHROPIC_API_KEY"), "ANTHROPIC_API_KEY not found in .env"

Evaluating Great Tables Code Generation with chatlas and inspect_ai
Introduction
This document evaluates LLM-generated Great Tables code using the chatlas Python package and Inspect for systematic evaluation.
Instead of manually running prompts and checking if code works, we’ll:
- define test cases for the first 3 examples (API Latency, Tech Salaries, GitHub Repos)
- use chatlas with Inspect to automatically generate code with Claude
- use Inspect to judge the quality of generated code: poor, fair, or great
- compare results with and without API documentation
Setup
Test Case Definitions
We’ll test the same three examples from the manual analysis, focusing on the ones that showed clear improvement with API documentation.
Example 1: API Latency Dashboard
Simple formatting and conditional styling.
# Define test prompts (success = code runs without errors)
# Example 1: basic number formatting, conditional row highlighting, a title,
# sorting, and stub placement — the simplest of the three test cases.
example1_prompt = """Load the API latency data from `data/api_latency.csv` using Polars, then create a Great Tables table that:
- formats the millisecond columns with comma separators
- formats the requests column with comma separators
- formats the errors column with comma separators
- highlights rows where p99_ms is over 1000ms in light red
- adds a title "API Performance Dashboard"
- sorts by p99_ms descending
- puts the endpoint column in the stub
The CSV has columns: endpoint, method, avg_ms, p95_ms, p99_ms, requests, errors"""

Example 2: Tech Salaries
Row grouping, currency formatting, color gradients.
# Example 2: row grouping, currency formatting, a white-to-green gradient,
# smaller gray location text, and column spanners.
example2_prompt = """Using Great Tables, create a table from `data/tech_salaries.csv` that:
- Groups rows by role
- Formats all salary/compensation columns as currency (no cents)
- Colors the total_comp column using a gradient from white to green, with higher values darker
- Shows the location in a smaller, gray font
- Adds column spanners: "Role" for role/level, "Compensation" for the money columns, "Details" for yoe/location
The CSV has columns: role, level, base_salary, bonus, equity, total_comp, yoe, location"""

Example 3: GitHub Repository Stats
More complex, with multi-column formatting, bar charts, and conditional styling.
# Example 3: the most complex case — K-suffix formatting, nanoplot bar
# charts, threshold-based cell coloring, and a spanner.
example3_prompt = """Create a Great Tables table from `data/github_repos.csv` that:
- Formats stars with K suffix (e.g., 23.4K)
- Formats forks with comma separators
- Creates a bar chart in the stars column (using nanoplot)
- Colors the last_commit_days column: green for <3, yellow for 3-7, red for >7
- Adds a spanner "Issues" over issues_open and issues_closed
- Adds a title "GitHub Repository Metrics"
The CSV has columns: repo, language, stars, forks, issues_open, issues_closed, prs_merged, last_commit_days"""

Defining API Documentation Context
We’ll fetch the API documentation from Great Tables’ /.well-known/llms.txt file and add it to prompts to test the impact of providing reference material.
import httpx

# Fetch the condensed Great Tables API reference published for LLM consumption.
response = httpx.get("https://posit-dev.github.io/great-tables/.well-known/llms.txt")
# Fail fast on HTTP errors — otherwise an error page body would silently be
# passed along as "API documentation" in the *_with_docs prompts below.
response.raise_for_status()
api_context = response.text

Defining the Code Quality Scorer
We’ll create an Inspect scorer that evaluates the generated code and rates it as ‘poor’, ‘fair’, or ‘great’.
import subprocess
import tempfile
from pathlib import Path
from chatlas import ChatAnthropic
def execute_code(code: str) -> tuple[bool, str]:
    """
    Execute LLM-generated code in a subprocess and report whether it ran.

    The raw completion may be wrapped in markdown fences and may omit its
    imports, so the text is normalized first (fence stripping, then import
    injection) before being written to a temp script and executed.

    Args:
        code: Raw model output, possibly fenced with ```python ... ```.

    Returns:
        (success, message): True plus a success note when the script exits
        with status 0, otherwise False plus the captured stderr / reason.
    """
    code = _strip_markdown_fences(code.strip())
    code = _inject_missing_imports(code)

    # Write the snippet to a temp file so it runs as a standalone script.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
        f.write(code)
        temp_file = f.name
    try:
        # Run under the SAME interpreter as this process (sys.executable),
        # not whatever bare 'python' resolves to on PATH — otherwise the
        # snippet may execute in a different environment that lacks the
        # required libraries. cwd stays the project root so relative data
        # paths like data/api_latency.csv resolve.
        result = subprocess.run(
            [sys.executable, temp_file],
            capture_output=True,
            text=True,
            timeout=10,
            cwd=str(Path.cwd())
        )
        if result.returncode == 0:
            return True, "Code executed successfully"
        return False, result.stderr
    except subprocess.TimeoutExpired:
        return False, "Timeout: code took too long to execute"
    except Exception as e:
        return False, str(e)
    finally:
        # Always remove the temp script.
        Path(temp_file).unlink(missing_ok=True)


def _strip_markdown_fences(code: str) -> str:
    """Return `code` with surrounding ```python markdown fences removed."""
    import re
    # Prefer a proper fenced block; pick the largest if several exist.
    matches = re.findall(r'```(?:python)?\s*\n(.*?)```', code, re.DOTALL)
    if matches:
        return max(matches, key=len).strip()
    # Fall back to trimming dangling fence markers.
    if code.startswith("```python"):
        code = code[len("```python"):].lstrip()
    elif code.startswith("```"):
        code = code[3:].lstrip()
    if code.endswith("```"):
        code = code[:-3].rstrip()
    return code


def _inject_missing_imports(code: str) -> str:
    """Prepend Great Tables / Polars imports when they appear to be absent.

    Detection is by substring only, so any mention of the import (even in a
    comment) suppresses injection.
    """
    import_block = []
    if not ("from great_tables import" in code or "import great_tables" in code):
        import_block.append("from great_tables import GT, md, html, loc, style")
    if not ("import polars" in code or "import pandas" in code):
        import_block.append("import polars as pl")
    if import_block:
        return "\n".join(import_block) + "\n\n" + code
    return code
@scorer(metrics=[])
def code_quality_scorer():
    """
    Score generated Great Tables code as poor, fair, or great based on:
    - Poor (0): missing key features or completely wrong approach
    - Fair (1): has most requested features but minor issues or missing details
    - Great (2): implements all requested features correctly

    Note: we judge based on code content, not just execution success, so an
    execution failure does not automatically yield a "poor" rating.
    """
    # Numeric value recorded for each rating label.
    _VALUES = {"great": 2, "fair": 1, "poor": 0}

    async def score(state, target):
        # The candidate code is the model's raw completion.
        code = state.output.completion

        judge_prompt = (
            "You are evaluating Python code that uses the Great Tables library.\n\n"
            f"Original Request: {target}\n\n"
            "Generated Code:\n"
            f"```python\n{code}\n```\n\n"
            "Evaluate the code based on whether it contains the requested features, "
            "not whether it runs perfectly. Rate as:\n"
            "- GREAT if it implements all requested features with correct method calls\n"
            "- FAIR if it has most features but is missing some details or has minor issues\n"
            "- POOR if it's missing key features or uses completely wrong approaches\n\n"
            "Focus on: Does the code attempt to use the right Great Tables methods? "
            "Are the requested formatting, styling, and structural features present?\n\n"
            "Provide your rating (GREAT, FAIR, or POOR) and a brief explanation."
        )

        # A fresh judge per call: chatlas chats are stateful (conversation
        # history), so reusing one across samples would leak context between
        # evaluations.
        # NOTE(review): judge.chat() is a blocking call inside an async
        # scorer — confirm whether chatlas offers an async variant here.
        judge = ChatAnthropic(model="claude-sonnet-4-5")
        judgment = judge.chat(judge_prompt)

        # Use whichever rating keyword appears FIRST in the judgment. The
        # previous substring heuristic could mis-rate text such as
        # "POOR — nowhere near GREAT quality" as great.
        judgment_upper = judgment.content.upper()
        hits = [
            (judgment_upper.find(label), label)
            for label in ("GREAT", "FAIR", "POOR")
            if label in judgment_upper
        ]
        rating = min(hits)[1].lower() if hits else "poor"

        return Score(
            value=_VALUES[rating],
            answer=rating,
            explanation=judgment.content,
            metadata={"quality": rating},
        )
return score

Creating Inspect Tasks
Now let’s create Inspect tasks for systematic evaluation.
# One (prompt, target description, id stem) triple per test case; both
# sample lists below are derived from this single spec.
_cases = [
    (
        example1_prompt,
        "API Latency Dashboard with formatting and conditional styling",
        "example1",
    ),
    (
        example2_prompt,
        "Tech Salaries table with row grouping, currency formatting, and color gradients",
        "example2",
    ),
    (
        example3_prompt,
        "GitHub Repository Stats with nanoplots, multi-column formatting, and complex styling",
        "example3",
    ),
]

# Baseline condition: the bare prompt only.
samples_no_docs = [
    Sample(input=prompt, target=desc, id=f"{stem}_no_docs")
    for prompt, desc, stem in _cases
]

# Docs condition: the same prompt with the llms.txt API reference appended.
samples_with_docs = [
    Sample(input=prompt + "\n\n" + api_context, target=desc, id=f"{stem}_with_docs")
    for prompt, desc, stem in _cases
]
# System prompt for the baseline condition: force pure, directly runnable
# Polars code with no surrounding prose or markdown.
_NO_DOCS_SYSTEM = (
    "You are a Python expert. Generate ONLY executable Python code using Polars "
    "(not Pandas) for data manipulation. Do not include any explanations, "
    "markdown formatting, or text before or after the code. Output pure Python "
    "code that can be run directly. Always use `import polars as pl` and "
    "`pl.read_csv()` for reading CSV files."
)


@task
def eval_no_docs():
    """Evaluate the three prompts WITHOUT the Great Tables API docs attached.

    NOTE(review): confirm this inspect_ai version accepts system_message= on
    Task — newer releases pass the system_message() solver instead.
    """
    return Task(
        dataset=samples_no_docs,
        scorer=code_quality_scorer(),
        solver=generate(),
        system_message=_NO_DOCS_SYSTEM,
    )
# Same evaluation, but each sample's prompt carries the llms.txt API docs.
# NOTE(review): confirm this inspect_ai version accepts system_message= on
# Task — newer releases pass the system_message() solver instead.
@task
def eval_with_docs():
return Task(
dataset=samples_with_docs,
solver=generate(),
scorer=code_quality_scorer(),
system_message="You are a Python expert. Generate ONLY executable Python code using Polars (not Pandas) for data manipulation. Do not include any explanations, markdown formatting, or text before or after the code. Output pure Python code that can be run directly. Use the Great Tables API documentation provided in the prompt. Always use `import polars as pl` and `pl.read_csv()` for reading CSV files."
)

Running Evaluations
The last thing to do is eval() the two bundles of test cases.
import io
import sys
from contextlib import redirect_stderr, redirect_stdout

# Buffer all stdout/stderr while the evaluations run so asyncio warnings and
# progress noise do not leak into the rendered document.
captured_stdout = io.StringIO()
captured_stderr = io.StringIO()

with redirect_stderr(captured_stderr), redirect_stdout(captured_stdout):
    # `eval` is inspect_ai's runner (imported at the top of the file),
    # not the Python builtin.
    results_no_docs = eval(eval_no_docs(), model="anthropic/claude-sonnet-4-5")
    results_with_docs = eval(eval_with_docs(), model="anthropic/claude-sonnet-4-5")
# Print only the summary lines without errors
print("✓ Evaluation completed successfully")

Results Summary Table
Now let’s create a Great Tables table to visualize the quality ratings.
from great_tables import GT

# Human-readable row labels for the summary table. Hoisted out of the loop
# below — the original rebuilt this list on every iteration — and index-
# aligned with the sample order used when the tasks were defined.
example_names = [
    "1. API Latency Dashboard",
    "2. Tech Salaries",
    "3. GitHub Repository Stats",
]

_SCORER_NAME = "code_quality_scorer"


def _sample_quality(sample):
    """Return the quality rating recorded for one evaluated sample.

    Prefers the scorer's metadata["quality"], falls back to its answer, and
    yields "error" when the sample or its score is missing.
    """
    if sample and sample.scores and _SCORER_NAME in sample.scores:
        score = sample.scores[_SCORER_NAME]
        return score.metadata.get("quality", score.answer)
    return "error"


results_list = []
for i, name in enumerate(example_names):
    # Samples come back in task order, so index i matches example i+1.
    no_docs_sample = results_no_docs[0].samples[i] if results_no_docs else None
    with_docs_sample = results_with_docs[0].samples[i] if results_with_docs else None
    no_docs_score = _sample_quality(no_docs_sample)
    with_docs_score = _sample_quality(with_docs_sample)
    results_list.append({
        "Example": name,
        "Without API Docs": no_docs_score.title() if isinstance(no_docs_score, str) else "Error",
        "With API Docs": with_docs_score.title() if isinstance(with_docs_score, str) else "Error",
    })

results_df = pl.DataFrame(results_list)
# Create the Great Tables table
(
GT(results_df)
# Header: title plus the model under test
.tab_header(
title="LLM Code Generation Quality Ratings",
subtitle="Claude Sonnet 4.5 tested on Great Tables code generation"
)
# Color-code each rating cell: red = Poor, yellow = Fair, green = Great
.data_color(
columns=["Without API Docs", "With API Docs"],
palette=["#FF6B6B", "#FFD93D", "#6BCF7F"],
domain=["Poor", "Fair", "Great"]
)
.cols_label(
Example="Test Case"
)
# Legend explaining the three rating levels
.tab_source_note(
source_note="Quality ratings: Poor (doesn't run) | Fair (runs but incomplete) | Great (runs and implements all features)"
)
)

| LLM Code Generation Quality Ratings | ||
| Claude Sonnet 4.5 tested on Great Tables code generation | ||
| Test Case | Without API Docs | With API Docs |
|---|---|---|
| 1. API Latency Dashboard | Great | Great |
| 2. Tech Salaries | Great | Great |
| 3. GitHub Repository Stats | Great | Great |
| Quality ratings: Poor (doesn't run) | Fair (runs but incomplete) | Great (runs and implements all features) | ||
Conclusion
This analysis demonstrates how to:
- use chatlas with inspect_ai for systematic LLM evaluation
- automatically generate code from prompts
- use inspect_ai scorers to judge code quality beyond simple pass/fail
- rate generated code as poor, fair, or great based on execution and feature completeness
- systematically compare results with and without API documentation
- visualize results with Great Tables
This approach makes it practical to validate that documentation actually helps LLMs not just generate working code, but generate high-quality code. We can test different prompting strategies systematically and ensure API changes don’t break common use cases.